import numpy as np
import pandas as pd
import dalex as dx
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')
# Load the medical-insurance dataset and preview the first rows.
df = pd.read_csv('insurance.csv')
df.head()
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
Checking for nulls.
df.isnull().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
No nulls.
Encoding categorical features.
# Encode the three categorical columns as integer codes so the sklearn
# regressors can consume them.  LabelEncoder assigns codes by sorted
# value (e.g. female=0/male=1, no=0/yes=1), so this loop produces exactly
# the same codes as the original per-column fit/transform pairs.
# NOTE(review): label-encoding 'region' imposes an arbitrary ordering on
# a nominal feature; one-hot encoding would be safer for linear models.
for col in ('sex', 'smoker', 'region'):
    df[col] = LabelEncoder().fit_transform(df[col])
Checking correlation.
df.corr()
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.020856 | 0.109272 | 0.042469 | -0.025019 | 0.002127 | 0.299008 |
| sex | -0.020856 | 1.000000 | 0.046371 | 0.017163 | 0.076185 | 0.004588 | 0.057292 |
| bmi | 0.109272 | 0.046371 | 1.000000 | 0.012759 | 0.003750 | 0.157566 | 0.198341 |
| children | 0.042469 | 0.017163 | 0.012759 | 1.000000 | 0.007673 | 0.016569 | 0.067998 |
| smoker | -0.025019 | 0.076185 | 0.003750 | 0.007673 | 1.000000 | -0.002181 | 0.787251 |
| region | 0.002127 | 0.004588 | 0.157566 | 0.016569 | -0.002181 | 1.000000 | -0.006208 |
| charges | 0.299008 | 0.057292 | 0.198341 | 0.067998 | 0.787251 | -0.006208 | 1.000000 |
A strong correlation with charges is observed only for smoking (r ≈ 0.79).
# Split features/target, hold out a test set (default 25%), and fit a
# plain linear-regression baseline.
x = df.drop(columns=['charges'])
y = df.charges
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lr = LinearRegression()
lr.fit(x_train, y_train)
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

# R^2 on the held-out data.
print(lr.score(x_test, y_test))
0.7962732059725786
# Random-forest regressor: a non-linear model to compare against the
# linear baseline above.
# FIX: criterion='mse' was deprecated in scikit-learn 0.24 and removed
# in 1.0; 'squared_error' is the same loss under its current name.
forest = RandomForestRegressor(n_estimators=100,
                               criterion='squared_error',
                               random_state=1,
                               n_jobs=-1)
forest.fit(x_train, y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

# The train/test gap (R2 0.974 vs 0.873 below) indicates some overfitting.
print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train,forest_train_pred),
mean_squared_error(y_test,forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train,forest_train_pred),
r2_score(y_test,forest_test_pred)))
MSE train data: 3729086.094, MSE test data: 19933823.142 R2 train data: 0.974, R2 test data: 0.873
# Compare both models against the ground truth on one training row.
idx = 269
sample = x_train.iloc[[idx]]
print("LinearRegression:")
print(lr.predict(sample))
print("Forest:")
print(forest.predict(sample))
print("correct:")
print(y_train.iloc[[idx]])
LinearRegression: [26452.86161694] Forest: [28985.049443] correct: 1196 33307.5508 Name: charges, dtype: float64
Creating LIME explainer
# Build a LIME tabular explainer over the training distribution.
features = list(x.columns)
from lime import lime_tabular

explainer = lime_tabular.LimeTabularExplainer(
    x_train.to_numpy(),
    mode="regression",
    feature_names=features,
)
explainer
<lime.lime_tabular.LimeTabularExplainer at 0x1e8e5695af0>
# Local LIME explanation of the linear model's prediction.
# FIX: the predictions above were made for x_train row `idx`, but the
# original passed x_test row `idx` here, so the explanation described a
# *different* observation than the one whose prediction was printed.
# (The `0:6` column slice was redundant: the row has exactly 6 features.)
explanation_lr = explainer.explain_instance(x_train.to_numpy()[idx], lr.predict, num_features=len(features))
explanation_lr
<lime.explanation.Explanation at 0x1e8e1667610>
# Render the linear-model explanation inline and as a matplotlib figure.
explanation_lr.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_lr.as_pyplot_figure()
# Local LIME explanation of the random-forest prediction, rendered the
# same way as the linear one.
# FIX: as with the linear explanation, the original explained x_test row
# `idx` while the printed predictions used x_train row `idx`; explain the
# same training row so the explanation matches the prediction shown.
explanation_rf = explainer.explain_instance(x_train.to_numpy()[idx], forest.predict, num_features=len(features))
explanation_rf.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_rf.as_pyplot_figure()
# Repeat the model-vs-truth comparison for three test observations.
idx1 = 200
idx2 = 300
idx3 = 2
chosen = (idx1, idx2, idx3)

print("LinearRegression:")
for i in chosen:
    print(lr.predict(x_test.iloc[[i]]))
print("RandomForest")
for i in chosen:
    print(forest.predict(x_test.iloc[[i]]))
print("correct:")
for i in chosen:
    print(y_test.iloc[[i]])
LinearRegression: [13147.1222075] [3038.43915783] [38027.18625354] RandomForest [12187.0625795] [2357.1724826] [45243.2560303] correct: 1110 11512.405 Name: charges, dtype: float64 195 1639.5631 Name: charges, dtype: float64 569 45702.02235 Name: charges, dtype: float64
# LIME explanations of the linear model for the three test rows above.
# (Each row already has exactly 6 features, so no column slice is needed.)
test_matrix = x_test.to_numpy()
explanation_lr1 = explainer.explain_instance(test_matrix[idx1], lr.predict, num_features=len(features))
explanation_lr2 = explainer.explain_instance(test_matrix[idx2], lr.predict, num_features=len(features))
explanation_lr3 = explainer.explain_instance(test_matrix[idx3], lr.predict, num_features=len(features))
Observation #1
# Observation #1: linear-model explanation, inline and as a figure.
explanation_lr1.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_lr1.as_pyplot_figure()
Observation #2
# Observation #2: linear-model explanation, inline and as a figure.
explanation_lr2.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_lr2.as_pyplot_figure()
Observation #3
# Observation #3: linear-model explanation, inline and as a figure.
explanation_lr3.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_lr3.as_pyplot_figure()
# LIME explanations of the random forest for the same three test rows.
test_matrix = x_test.to_numpy()
explanation_rf1 = explainer.explain_instance(test_matrix[idx1], forest.predict, num_features=len(features))
explanation_rf2 = explainer.explain_instance(test_matrix[idx2], forest.predict, num_features=len(features))
explanation_rf3 = explainer.explain_instance(test_matrix[idx3], forest.predict, num_features=len(features))
Observation #1
# Observation #1: random-forest explanation, inline and as a figure.
explanation_rf1.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_rf1.as_pyplot_figure()
Observation #2
# Observation #2: random-forest explanation, inline and as a figure.
explanation_rf2.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_rf2.as_pyplot_figure()
Observation #3
# Observation #3: random-forest explanation, inline and as a figure.
explanation_rf3.show_in_notebook()
with plt.style.context("ggplot"):
    explanation_rf3.as_pyplot_figure()
The explanations seem to be stable. Not being a smoker always decreases the predicted charges, no matter what the age is. Moreover, a lower age decreases the predicted charges, and a medium BMI has little to no impact on the prediction; however, these features have much less of an impact on the prediction than smoking.